load data: 207 subjects, with 854 goals are included in the following analysis

goalRating_long_R <- read.csv("./output/goalRating_long_R.csv",stringsAsFactors = F)

indivDiffDf <- read.csv("./output/indivDiffDf.csv",stringsAsFactors = F)

goalDf_wide <- read.csv("./output/goalDf_wide.csv",stringsAsFactors = F)

Data Screening for goal representation assessment

Missing data

Check the number of missing data per variable, and below is the top 5 variables. Missing data is rare for all variables

# check the number of "I'm not sure" responses per varialbe
totalGoal <- nrow(goalRating_long_R)/35

goalRating_long_R %>%
  filter(is.na(rating)) %>%
  tabyl(variable) %>%
  mutate(percent = n/totalGoal) %>%
  arrange(desc(percent)) %>%
  head(5)
##                   variable n     percent
## 1              frequency_R 6 0.007025761
## 2            attainability 5 0.005854801
## 3  attractiveness_progress 4 0.004683841
## 4              basic_needs 4 0.004683841
## 5 attainment_maintenance_R 3 0.003512881

The “I’m not sure” response

“construal_level”,“approach_avoidance” and “attainment_maintenance” question have an option for “I’m not sure” because they ask subjects to categorilize their goals.

around 4% of the goals had “I’m not sure” as the response.

The modification for construal level question seemed to be sucessful!

# check the number of "I'm not sure" responses per varialbe
goalRating_long_R %>%
  filter(rating == 99) %>%
  tabyl(variable) %>%
  mutate(percent = n/totalGoal) %>%
  arrange(desc(percent))
##                   variable  n    percent
## 1     approach_avoidance_R 39 0.04566745
## 2 attainment_maintenance_R 39 0.04566745
## 3          construal_level 38 0.04449649

around 2.9% participants select the “I’m not sure” option for construal level for more than once.

# get the number of total subject
totalSub <- nrow(indivDiffDf)

# check the percentage of participants who selected "I'm not sure" for construal level more than once
goalRating_long_R %>%
  filter(rating == 99 & variable == "construal_level") %>%
  tabyl(id) %>%
  filter(n >1) %>%
  nrow()/totalSub
## [1] 0.02898551

The “not specified” response

temporal_duration, frequency and end_state_specificity question have an option for “not specified” because they ask about features that may not be applicable to all goals.

The end state specificity is not applicable to around 17% of the goals

# check the number of "not specified" responses per varialbe
goalRating_long_R %>%
  filter(rating == 999) %>%
  tabyl(variable) %>%
  mutate(percent = n/totalGoal) %>%
  arrange(desc(percent))
##                  variable   n    percent
## 1 end_state_specificity_R 144 0.16861827
## 2             frequency_R  60 0.07025761
## 3       temporal_duration  60 0.07025761

Transform all special cases to NAs

All “I’m not sure” and “not specified” responses will be treated as missing data.

# transform 99 & 999 to NAs
goalRating_long_R <- goalRating_long_R %>% 
  mutate(rating = replace(rating, rating == 99 | rating == 999, NA))

The number of claimed goals

Descriptive on the number of goals subject claimed to have prior to listing them

describe(goalDf_wide$total_goal)
##    vars   n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 207 4.92 2.63      4    4.56 1.48   1  15    14 1.44     1.97 0.18
breaks = (1:15)
goalDf_wide %>% 
  ggplot(aes(x = total_goal)) + 
  scale_x_continuous(labels=scales::comma(breaks, accuracy = 1), breaks=breaks) + 
  geom_histogram(fill = "orange", 
                 colour = "black",
                 binwidth = 1) + 
  labs(x="Number of claimed goals", y="# of participants") +
  theme_classic(base_size = 18) 

The percentage of subjects who claimed having more than 5 goals: 24.6%

length(goalDf_wide$total_goal[goalDf_wide$total_goal>5])/totalSub
## [1] 0.2463768

Descriptive on the number of goals participants actual listed

describe(goalDf_wide$listNum)
##    vars   n mean   sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 207 4.13 1.01      4    4.26 1.48   1   5     4 -0.79    -0.52 0.07
breaks <- (1:5)
goalDf_wide %>% 
  ggplot(aes(x = listNum)) + 
  scale_x_continuous(labels=scales::comma(breaks, accuracy = 1), breaks=seq(1, 5, by = 1)) + 
  geom_histogram(fill = "orange", 
                 colour = "black",
                 binwidth = 1) + 
  labs(x="Number of listed goals", y="# of participants") +
  theme_classic(base_size = 18) 

number of people who listed 1 goal: 1

length(goalDf_wide$listNum[goalDf_wide$listNum == 1])
## [1] 1

descriptvie on the differences between the number of claimed goals and listed goals

diffNum <- goalDf_wide$total_goal - goalDf_wide$listNum
describe(diffNum)
##    vars   n mean   sd median trimmed mad min max range skew kurtosis   se
## X1    1 207  0.8 2.12      0    0.46   0  -2  10    12 1.95     4.02 0.15
breaks <- (-2:10)
goalDf_wide %>% 
  ggplot(aes(x = diffNum)) + 
  scale_x_continuous(labels=scales::comma(breaks, accuracy = 1), breaks=breaks) + 
  geom_histogram(fill = "orange", 
                 colour = "black",
                 binwidth = 1) + 
  labs(x="Number of claimed goals - listed goals", y="# of participants") +
  theme_classic(base_size = 18) 

percentage of people who listed more goals than they claimed: 11.6%

length(diffNum[diffNum <0])/totalSub *100
## [1] 11.5942

percentage of people who listed less goals than they claimed: 27%

length(diffNum[diffNum >0])/totalSub *100
## [1] 27.05314

Even if both the median and the mad of the difference is 0, around 38% of the participants either had to pick 5 out of all of their goals or came up some goals on the spot. Need to be aware of the priming or the order effect.

Goal Representation Ratings

Descriptive stats

# descriptive stats for each variable 
goalRating_long_R %>%
  dplyr::select(variable, rating) %>%
  group_by(variable) %>%
  summarize(mean = mean(rating, na.rm = TRUE),
            sd = sd(rating, na.rm = TRUE), 
            n = n(),
            min = min(rating, na.rm = TRUE),
            max = max(rating, na.rm = TRUE),
            skew = skew(rating, na.rm = T), 
            kurtosi = kurtosi(rating, na.rm = T)
            ) %>%
arrange(skew)
## # A tibble: 35 x 8
##    variable                    mean    sd     n   min   max   skew kurtosi
##    <chr>                      <dbl> <dbl> <int> <int> <int>  <dbl>   <dbl>
##  1 approach_avoidance_R        6.17 1.49    854     1     7 -1.99    3.26 
##  2 control                     6.28 1.04    854     1     7 -1.60    2.41 
##  3 identified_motivation       5.96 1.24    854     1     7 -1.32    1.69 
##  4 ideal_motivation            5.78 1.46    854     1     7 -1.17    0.732
##  5 importance                  6.05 1.16    854     1     7 -1.13    0.663
##  6 social_desirability         5.85 1.31    854     1     7 -1.13    0.872
##  7 attractiveness_achievement  6.12 0.995   854     2     7 -1.09    0.803
##  8 instrumentality             5.57 1.55    854     1     7 -1.08    0.613
##  9 commonality                 5.46 1.61    854     1     7 -0.999   0.309
## 10 temporal_duration           3.29 0.844   854     1     4 -0.981   0.123
## # … with 25 more rows
# order based on their skewness 
#kable(varDf[order(varDf$skew),])

The approach_avoidance has very little variance. After changing the anchors for both the attractiveness variables, their distributions are similar to other variables such as important and ideal_motivation. This time control is the most positively skewed other than approach_avoidance.

Should we change the approach_avoidance to ordinal?

# histograme for each dimension
goalRating_long_R %>%
  ggplot(aes(x = rating)) +
    geom_histogram(fill   = "orange",
                   colour = "black",
                   alpha  = .6) +
    facet_wrap(~variable, nrow = 7)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

correlational matrix across all variables

“pairwise.complete.obs” is used for generating correlational matrix.The correlations make sense

# transform the long format to short format
goalDf_wide <- goalRating_long_R %>% spread (variable, rating)

# generate a correlational matrix
corrM_all <- goalDf_wide %>% 
  dplyr :: select(affordance:visibility) %>% 
  cor(use = "pairwise.complete.obs")

# visualization
corrplot(corrM_all, method = "circle",number.cex = .7, order = "AOE", addCoef.col = "black",type = "upper")

Variance Partition

Only the 30 variables for goal representation are included. Only around 7% of the variance is on the between subject level.

# subset the long format dataset for only the 30 goal representation variable
goal_striving <- c("initial_time_R", "advancement", "urgency", "effort", "commitment")
goalDf_R_long <- goalRating_long_R[!goalRating_long_R$variable %in% goal_striving,]

# generate a multilevel model with subject as the random intercept
mlm <-lmer(rating ~ variable + (1|id), data = goalDf_R_long)

# calculate the variance partition coefficient and transform to ICC
VarCorr(mlm) %>%
  as_tibble() %>%
  mutate(icc=vcov/sum(vcov)) %>%
  dplyr :: select(grp, icc)
## # A tibble: 2 x 2
##   grp         icc
##   <chr>     <dbl>
## 1 id       0.0702
## 2 Residual 0.930
Raw <- VarCorr(mlm) %>%
  as_data_frame() %>%
  mutate(Raw=vcov/sum(vcov)) %>%
  dplyr :: select(Raw)
## Warning: `as_data_frame()` is deprecated as of tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

Exploritory Factor Analysis

Data transformation

27 varialbes are included. Ordinal variables are not included: “temporal_duration” & “end_state_specificity” and “frequency”

# Exclude the 5 varialbes related to goal striving progress
goalDf_R_wide <- goalDf_wide[,!names(goalDf_wide) %in% goal_striving]

# Exclude ordinal variables: temporal_duration & end_state_specificity and frequency and other columns with irrelevent data
goal_ordinal <- c("temporal_duration", "end_state_specificity_R", "frequency_R")
goalDf_EFA <- goalDf_R_wide[,!names(goalDf_R_wide) %in% goal_ordinal]
goalDf_EFA <- subset(goalDf_EFA, select = affordance : visibility)

# Generate a correlational matrix 
corrM_raw <- cor(goalDf_EFA, use = "pairwise")

Evaluate the number of factors to extract

Both the Very Simple Structure evaluation and parallel analysis recommend 5 factors. There are 7 factors have an eigen value > 1. Therefore, models with 5, 6, and 7 factors will be explored

# use Very Simple Structure criterion
vss(corrM_raw, n = 10, rotate = "promax", diagonal = FALSE, fm = "minres", 
n.obs=844,plot=TRUE,title="Very Simple Structure",use="pairwise",cor="cor")
## Loading required namespace: GPArotation

## 
## Very Simple Structure
## Call: vss(x = corrM_raw, n = 10, rotate = "promax", diagonal = FALSE, 
##     fm = "minres", n.obs = 844, plot = TRUE, title = "Very Simple Structure", 
##     use = "pairwise", cor = "cor")
## VSS complexity 1 achieves a maximimum of 0.59  with  1  factors
## VSS complexity 2 achieves a maximimum of 0.68  with  5  factors
## 
## The Velicer MAP achieves a minimum of 0.01  with  3  factors 
## BIC achieves a minimum of  -682.96  with  8  factors
## Sample Size adjusted BIC achieves a minimum of  -205.51  with  10  factors
## 
## Statistics by number of factors 
##    vss1 vss2   map dof chisq     prob sqresid  fit RMSEA  BIC  SABIC complex
## 1  0.59 0.00 0.018 324  3392  0.0e+00      25 0.59 0.106 1209 2237.7     1.0
## 2  0.48 0.60 0.014 298  2213 4.8e-289      25 0.60 0.087  205 1151.0     1.3
## 3  0.56 0.67 0.013 273  1634 2.0e-192      20 0.68 0.077 -205  661.8     1.4
## 4  0.55 0.65 0.014 249  1255 7.8e-134      18 0.71 0.069 -422  368.3     1.7
## 5  0.55 0.68 0.014 226   930  3.6e-86      16 0.74 0.061 -592  125.4     1.8
## 6  0.46 0.65 0.017 204   732  1.2e-60      18 0.70 0.055 -642    5.6     1.9
## 7  0.43 0.48 0.020 183   555  3.7e-39      23 0.62 0.049 -678  -96.6     1.8
## 8  0.43 0.52 0.022 163   415  5.7e-24      24 0.61 0.043 -683 -165.3     1.9
## 9  0.43 0.51 0.024 144   323  9.2e-16      23 0.62 0.038 -647 -190.0     1.8
## 10 0.42 0.49 0.028 126   243  1.7e-09      24 0.60 0.033 -606 -205.5     1.9
##    eChisq  SRMR eCRMS eBIC
## 1    6006 0.101 0.105 3823
## 2    3345 0.075 0.082 1337
## 3    1930 0.057 0.065   90
## 4    1144 0.044 0.052 -533
## 5     710 0.035 0.043 -812
## 6     533 0.030 0.039 -841
## 7     391 0.026 0.036 -842
## 8     276 0.022 0.032 -822
## 9     186 0.018 0.028 -784
## 10    121 0.014 0.024 -728
# Use the Scree plot to identify the number of factors have Eigenvalues >1 and the output from the Parallel analysis

ev <- eigen(corrM_raw)
ap <- parallel(subject=nrow(goalDf_EFA),var=ncol(goalDf_EFA),
  rep=100,cent=.05)
nS <- nScree(x=ev$values, aparallel=ap$eigen$qevpea)
plotnScree(nS)

Extract factors

Extract number of factors based on the suggestions above. Because we expect factors to be correlated with each other, we use “promax” rotation.

# extract 5 factors
fa_raw_5 <-fa(r=corrM_raw, nfactors=5,n.obs = 854, rotate="promax", SMC=FALSE, fm="minres")
#fa.sort(fa_raw_5)

# extract 6 factors
fa_raw_6 <-fa(r=corrM_raw, nfactors=6,n.obs = 854, rotate="promax", SMC=FALSE, fm="minres")

# extract 7 factors
fa_raw_7 <-fa(r=corrM_raw, nfactors=7,n.obs = 854, rotate="promax", SMC=FALSE, fm="minres")
#fa.sort(fa_raw_7)

Compare loadings for each model

5 factors

fa.diagram(fa_raw_5)

6 factors

fa.diagram(fa_raw_6)

7 factors

fa.diagram(fa_raw_7)

Compare model fit & complexity

# generate a dataframe 
fa_fitDf <- data.frame(factors = c(5,6,7),
                        chi = c(fa_raw_5$chi,fa_raw_6$chi,fa_raw_7$chi),
                        BIC = c(fa_raw_5$BIC,fa_raw_6$BIC,fa_raw_7$BIC),
                        fit = c(fa_raw_5$fit,fa_raw_6$fit,fa_raw_7$fit),
                        RMSEA = c(fa_raw_5$RMSEA[1],fa_raw_6$RMSEA[1],fa_raw_7$RMSEA[1]),
                       cumVar = c(max(fa_raw_5$Vaccounted[3,]), max(fa_raw_6$Vaccounted[3,]),max(fa_raw_7$Vaccounted[3,])),
                        complexity = c(mean(fa_raw_5$complexity),mean(fa_raw_6$complexity),mean(fa_raw_7$complexity)))

fa_fitDf
##   factors      chi       BIC       fit      RMSEA    cumVar complexity
## 1       5 718.8370 -583.7973 0.8182823 0.06088328 0.4009636   1.757290
## 2       6 539.7593 -635.8116 0.8318495 0.05551584 0.4221319   1.900502
## 3       7 395.3800 -673.1642 0.8448579 0.04923615 0.4442019   1.849738

The most parsimonious model is pretty interpretable. However, more than 1/3 of the variables are clustered on the first factor. This may not be ideal when we explore between subject level variance. The model with 7 factors seems to load (less cross loading) better and explain a little more variance. However, the interfactor correlation may be too high. In order to expand the exploration into the factor variance, I’ll go with 7 factors for now. The additional 2 factors are attractiveness and visibility

Another issue to consider is the mix of unipolar and bipolar. Among the 27 numeric variables, we have 10 bipolar variables: are social desirability, clarity, controllability, all measures about motivations, commonality. They load on different factors on both models, so it may not be a huge issue. ### Final model: 7 Factors

# organize loadings
loadings <- fa.sort(fa_raw_7)$loadings
loadings <- as.data.frame(unclass(loadings))
colnames(loadings) <-  c("importance", "ought", "attractiveness", "measuarability", "commonality", "attainability", "visibility")
loadings$Variables <- rownames(loadings)
loadings.m <- loadings %>% gather(-Variables, key = "Factor", value = "Loading")
colOrder <- c("importance", "ought", "attractiveness", "measuarability", "commonality", "attainability", "visibility")
rowOrder <- rev(rownames(loadings))
loadings.m<- arrange(mutate(loadings.m,Variables=factor(Variables,leve=rowOrder)),Variables)
loadings.m<- arrange(mutate(loadings.m,Factor=factor(Factor,leve=colOrder)),Factor)

# Visualization
ggplot(loadings.m, aes(Variables, abs(Loading), fill=Loading)) + 
  facet_wrap(~ Factor, nrow=1) + #place the factors in separate facets
  geom_bar(stat="identity") + #make the bars
  coord_flip() + #flip the axes so the test names can be horizontal  
  #define the fill color gradient: blue=positive, red=negative
  scale_fill_gradient2(name = "Loading", 
                       high = "orange", mid = "white", low = "blue", 
                       midpoint=0, guide="colourbar") +
  ylab("Loading Strength") + #improve y-axis label
  ggtitle("Loadings for 7 factors") + 
  theme_bw(base_size=10)

Alternative model: 5 Factors

# visualization
loadings <- fa.sort(fa_raw_5)$loadings
loadings <- as.data.frame(unclass(loadings))
colnames(loadings) <- c("importance", "ought", "measurability", "attainability", "commonality")
loadings$Variables <- rownames(loadings)
loadings.m <- loadings %>% gather(-Variables, key = "Factor", value = "Loading")
colOrder <- c("importance", "ought", "measurability", "attainability", "commonality")
rowOrder <- rev(rownames(loadings))
loadings.m<- arrange(mutate(loadings.m,Variables=factor(Variables,leve=rowOrder)),Variables)
loadings.m<- arrange(mutate(loadings.m,Factor=factor(Factor,leve=colOrder)),Factor)

ggplot(loadings.m, aes(Variables, abs(Loading), fill=Loading)) + 
  facet_wrap(~ Factor, nrow=1) + #place the factors in separate facets
  geom_bar(stat="identity") + #make the bars
  coord_flip() + #flip the axes so the test names can be horizontal  
  #define the fill color gradient: blue=positive, red=negative
  scale_fill_gradient2(name = "Loading", 
                       high = "orange", mid = "white", low = "dark blue", 
                       midpoint=0, guide="colourbar") +
  ylab("Loading Strength") + #improve y-axis label + 
  ggtitle("Loadings for 5 factors") + 
  theme_bw(base_size=10)

interfactor correlation:

corr_fa <- fa_raw_7$Phi %>% 
  as.data.frame() %>% 
  dplyr::rename(importance = MR1, ought = MR2, attractiveness = MR6, measuarability = MR3, commonality = MR7, attainability = MR5, visibility = MR4) %>%
  round(.,2)
  
rownames(corr_fa) <- c("importance", "ought", "attractiveness", "measuarability", "commonality", "attainability", "visibility")
corr_fa
##                importance ought attractiveness measuarability commonality
## importance           1.00  0.37           0.56           0.40        0.58
## ought                0.37  1.00          -0.03           0.10        0.46
## attractiveness       0.56 -0.03           1.00           0.14        0.39
## measuarability       0.40  0.10           0.14           1.00        0.20
## commonality          0.58  0.46           0.39           0.20        1.00
## attainability        0.16  0.14           0.17           0.14        0.04
## visibility           0.27  0.23           0.19           0.09       -0.01
##                attainability visibility
## importance              0.16       0.27
## ought                   0.14       0.23
## attractiveness          0.17       0.19
## measuarability          0.14       0.09
## commonality             0.04      -0.01
## attainability           1.00       0.00
## visibility              0.00       1.00

Factor scores

Generate factors scores using simple weights (0 & 1)

Generate the factor score based on the mean

factorScoreDf <- goalDf_R_wide %>%
  mutate(Importance = rowMeans(select(.,instrumentality, ideal_motivation, connectedness, meaningfulness, identified_motivation, importance, construal_level),na.rm = T),
         Ought = rowMeans(select(., ought_motivation, external_motivation, external_importance , introjected_motivation), na.rm = T),
         Attractiveness = rowMeans(select(., attractiveness_achievement, attractiveness_progress, intrinsic_motivation), na.rm = T),
         Measurability = rowMeans(select(., measurability , clarity , specificity , control , approach_avoidance_R), na.rm = T),
         Commonality = rowMeans(select(., commonality , social_desirability, basic_needs), na.rm = T),
         Attainability = rowMeans(select(., difficulty , attainability , affordance), na.rm = T),
         Visibility = rowMeans(select(., visibility , attainment_maintenance_R), na.rm = T)) %>%
  select(id, list_goal = listNum, claim_goal = total_goal, goal_order = goal, Importance, Ought, Attractiveness, Measurability, Commonality, Attainability, Visibility) %>%
  mutate_all(round,2)

variation partition on factor scores

# Importance 
mlm <-lmer(Importance ~ 1 + (1|id), data = factorScoreDf)

Importance <- VarCorr(mlm) %>%
  as_tibble() %>%
  mutate(Importance=vcov/sum(vcov)) %>%
  dplyr :: select(Importance)

# Ought 
mlm <-lmer(Ought ~ 1 + (1|id), data = factorScoreDf)

Ought <- VarCorr(mlm) %>%
  as_tibble() %>%
  mutate(Ought=vcov/sum(vcov)) %>%
  dplyr :: select(Ought)

# Attractiveness 
mlm <-lmer(Attractiveness ~ 1 + (1|id), data = factorScoreDf)

Attractiveness <- VarCorr(mlm) %>%
  as_tibble() %>%
  mutate(Attractiveness=vcov/sum(vcov)) %>%
  dplyr :: select(Attractiveness)

# Measurability 
mlm <-lmer(Measurability ~ 1 + (1|id), data = factorScoreDf)

Measurability <- VarCorr(mlm) %>%
  as_tibble() %>%
  mutate(Measurability=vcov/sum(vcov)) %>%
  dplyr :: select(Measurability)

# Commonality 
mlm <-lmer(Commonality ~ 1 + (1|id), data = factorScoreDf)

Commonality <- VarCorr(mlm) %>%
  as_tibble() %>%
  mutate(Commonality=vcov/sum(vcov)) %>%
  dplyr :: select(Commonality)

# Attainability 
mlm <-lmer(Attainability ~ 1 + (1|id), data = factorScoreDf)

Attainability <- VarCorr(mlm) %>%
  as_tibble() %>%
  mutate(Attainability=vcov/sum(vcov)) %>%
  dplyr :: select(Attainability)

# Visibility 
mlm <-lmer(Visibility ~ 1 + (1|id), data = factorScoreDf)

Visibility <- VarCorr(mlm) %>%
  as_tibble() %>%
  mutate(Visibility=vcov/sum(vcov)) %>%
  dplyr :: select(Visibility)

# combine the outputs into one data frame
factorScore_icc <- data.frame("variation" = c("between subject", "within subject"))
factorScore_icc <- bind_cols(factorScore_icc, Importance, Ought,Attractiveness, Measurability,Commonality, Attainability, Visibility, Raw)

# visualization
factorV <- factorScore_icc %>% gather(-variation, key = factor, value = ICC)
ggplot(factorV, aes(fill=variation, y=ICC, x=factor)) + 
  geom_bar(position="stack", stat="identity", alpha = .8) +
  scale_fill_manual(values=c("dark blue", "orange")) + 
  ggtitle("ICC output")+ 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

The correlations between factor scores and goal progress variables

# combine goal level data
goalLevelDf <- cbind(select(factorScoreDf, "goal_order" : "Commonality"), goalDf_wide[,c("advancement", "commitment", "effort", "urgency", "initial_time_R")])

# generate a correlational matrix 
corrM_goalLevel <- cor(goalLevelDf,use = "pairwise")

# visualization
corrplot(corrM_goalLevel, method = "circle",number.cex = .7, order = "AOE", addCoef.col = "black",type = "upper")

subject level aggregated factor scores

# calculate the average score for each factor within each subject
subjectDf_factor <- aggregate(. ~ id, data = factorScoreDf, mean)

# calculate the sd for each factor wihtin each subject
subjectDf_factor_sd <- aggregate(. ~ id, data = factorScoreDf, sd)
subjectDf_factor_sd <- select(subjectDf_factor_sd, -c("list_goal", "claim_goal", "goal_order")) 

Mahalanobis distance

This session is for calculaing pair-wise distance between goals within each subject. Because the dimensions are not orthogonal, we use Mahalanobis distance to estimate the distance across all dimensions.

pair-wise distance

We use the covariance matrix across all goal ratings to calculate pair-wise distance. Subjects who only have 1 goal are excluded from this analysis

# set a function for calculating pairwise distiance
mahalanobisFun <- function(df, cov) { 
  MD <- combn(nrow(df), 2, function(x) mahalanobis(as.matrix(df[x[1],]), center = as.matrix(df[x[2],]), cov = cov))
  return(tryCatch(MD, error=function(e) NULL))
}

Exclude subjects with only one goal

# exclude subjects with only one goal
id_oneGoal <- goalDf_wide$id[goalDf_wide$listNum ==1]
factorScoreDf_clean <- factorScoreDf %>% filter(!id %in% id_oneGoal)

# split the dataset by IDs and then get rid off the ID column
splitDf <- split( factorScoreDf_clean, f = factorScoreDf_clean$id)
#splitDf <- split( factorScoreDf, f = factorScoreDf$id)
splitDf <- lapply(splitDf, function(x) subset(x, select = -id))

# get the covariance matrix on factor scores across all goals
factor_cov <- cov(subset(factorScoreDf_clean, select = -id))
#factor_cov <- cov(subset(factorScoreDf, select = -id))

# apply the distance function to each subject
output <- lapply(splitDf, function(x) mahalanobisFun(x, factor_cov))

Average number of pairs per subject:

# extract distance values
distance_M <- unlist(output)

# extract the number of pairs per subject
pairNum <- lapply(output, function(x) length(as.vector(x)))
pairNum <- unlist(pairNum)
mean(pairNum)
## [1] 6.985437
# generate a pairwise data frame
id <- unique(factorScoreDf_clean$id)
id_pair <- unlist(mapply(rep, id, pairNum))

# pairId <- unlist(mapply(seq,1,pairNum))

pairDf_M <- data.frame("subject_id" = id_pair,
                     "distance_M" = distance_M)

descriptive

# descriptive of all pairwise distance
describe(pairDf_M$distance_M)
##    vars    n  mean   sd median trimmed  mad min   max range skew kurtosis   se
## X1    1 1439 13.97 8.11  12.44   13.09 7.19 0.9 50.56 49.66 1.13     1.58 0.21
pairDf_M %>% ggplot(aes(x = distance_M)) +
    geom_histogram(fill   = "orange",
                   colour = "black",
                   alpha  = .6)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

variance partition

Around 27% of the variance in distance is on between subject level

# generate a multilevel model with subject as the random intercept
mlm <-lmer(distance_M ~ 1 + (1|subject_id), data = pairDf_M)

# calculate the variance partition coefficient and transform to ICC
VarCorr(mlm) %>%
  as_data_frame() %>%
  mutate(icc=vcov/sum(vcov)) %>%
  dplyr :: select(grp, icc)
## # A tibble: 2 x 2
##   grp          icc
##   <chr>      <dbl>
## 1 subject_id 0.274
## 2 Residual   0.726

subject-level distance (mean & sd)

15 subjects only have 1 pair

# calculate mean distance per subject
distDf_perSub_M <- pairDf_M %>%
  group_by(subject_id) %>%
  mutate(distMean = mean(distance_M),
         distSd = sd(distance_M)) %>%
  dplyr :: select(-distance_M)

distDf_perSub_M <- distDf_perSub_M[!duplicated(distDf_perSub_M$subject_id),]

# descriptive of subject-level mean distance
describe(distDf_perSub_M$distMean)
##    vars   n  mean   sd median trimmed  mad  min   max range skew kurtosis  se
## X1    1 206 13.73 5.76  12.83   13.22 4.55 1.51 42.47 40.97 1.16     2.83 0.4
distDf_perSub_M %>% ggplot(aes(x = distMean)) +
    geom_histogram(fill   = "orange",
                   colour = "black",
                   alpha  = .6)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# descriptive of subject-level distance sd
describe(distDf_perSub_M$distSd)
##    vars   n mean   sd median trimmed  mad min   max range skew kurtosis   se
## X1    1 191 6.17 2.99   5.24    5.87 2.36 0.6 20.38 19.78 1.26     2.71 0.22
distDf_perSub_M %>% ggplot(aes(x = distSd)) +
    geom_histogram(fill   = "orange",
                   colour = "black",
                   alpha  = .6)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 15 rows containing non-finite values (stat_bin).

subject level correlations

with factor scores

# combine subject level data frame (individual differences, mean factor scores, mean distance)
subjectDf <- left_join(subjectDf_factor, distDf_perSub_M, by = c("id" = "subject_id"))
subjectDf <- left_join(subjectDf, indivDiffDf, by = "id") %>% select(-goal_order)

# generate a correlational matrix 
corrM_subjectRaw <- cor(subjectDf[,-1],use = "pairwise")

as.data.frame(corrM_subjectRaw[1:11,10:30]) %>% round(2)
##                distMean distSd Extraversion_mean Agreeableness_mean
## list_goal          0.10   0.16             -0.13               0.05
## claim_goal         0.16   0.19             -0.13               0.08
## Importance        -0.25  -0.09              0.12               0.11
## Ought             -0.19  -0.15             -0.02               0.00
## Attractiveness    -0.22  -0.13              0.20               0.17
## Measurability     -0.09  -0.03              0.09               0.07
## Commonality       -0.28  -0.21              0.06               0.08
## Attainability     -0.17  -0.17              0.12               0.08
## Visibility        -0.01   0.14              0.03              -0.02
## distMean           1.00   0.79              0.04              -0.05
## distSd             0.79   1.00             -0.05              -0.11
##                Conscientiousness_mean Neuroticism_mean OpenMindedness_mean
## list_goal                       -0.06             0.08                0.11
## claim_goal                      -0.11             0.09                0.12
## Importance                       0.05            -0.04                0.18
## Ought                           -0.06             0.04                0.09
## Attractiveness                   0.13            -0.14                0.10
## Measurability                    0.01            -0.05                0.02
## Commonality                      0.00            -0.12                0.08
## Attainability                    0.06            -0.27                0.00
## Visibility                       0.02            -0.07               -0.03
## distMean                         0.04             0.04                0.07
## distSd                           0.01             0.02                0.09
##                BSCS_mean GOS_learning GOS_avoidance GOS_prove GSE_mean LET_mean
## list_goal           0.07         0.08         -0.14     -0.02    -0.07    -0.03
## claim_goal         -0.04         0.16         -0.13     -0.04     0.05    -0.07
## Importance          0.04         0.18         -0.02      0.23     0.18     0.20
## Ought              -0.04        -0.10          0.24      0.19     0.03    -0.11
## Attractiveness      0.09         0.23         -0.05      0.24     0.23     0.30
## Measurability       0.11         0.24         -0.08      0.08     0.18     0.28
## Commonality        -0.07        -0.06          0.02      0.13     0.15     0.18
## Attainability       0.23         0.24         -0.13     -0.02     0.42     0.23
## Visibility          0.06        -0.04          0.03     -0.01     0.04     0.06
## distMean            0.01         0.14         -0.02     -0.08     0.06    -0.02
## distSd              0.04         0.18         -0.06     -0.13     0.02    -0.09
##                PS_mean RSE_mean SWL_mean family_mean competetion_mean
## list_goal        -0.03    -0.07     0.00        0.03             0.02
## claim_goal       -0.06    -0.10    -0.02        0.01            -0.03
## Importance        0.10     0.05     0.09        0.18             0.07
## Ought            -0.02    -0.20    -0.11        0.14             0.10
## Attractiveness    0.12     0.15     0.21        0.11            -0.07
## Measurability     0.17     0.17     0.12        0.18             0.06
## Commonality       0.09     0.10     0.10        0.05            -0.02
## Attainability     0.14     0.21     0.14        0.03            -0.10
## Visibility        0.08     0.03    -0.04        0.14            -0.01
## distMean          0.12     0.00    -0.11        0.03             0.18
## distSd            0.01     0.03    -0.05        0.00             0.15
##                appearance_mean god_mean academic_mean
## list_goal                -0.01     0.09          0.13
## claim_goal                0.01    -0.02          0.10
## Importance               -0.05     0.18          0.18
## Ought                     0.00     0.10          0.15
## Attractiveness           -0.10     0.17          0.05
## Measurability             0.03     0.13          0.12
## Commonality              -0.07     0.20          0.04
## Attainability            -0.05    -0.03         -0.11
## Visibility                0.01     0.06          0.07
## distMean                  0.08    -0.12         -0.01
## distSd                    0.07    -0.06          0.06

with factor variation

# combine subject level data frame (individual differences, variance in factor scores, mean and sd in distance)
subjectDf_sd <- left_join(subjectDf_factor_sd, distDf_perSub_M, by = c("id" = "subject_id"))
subjectDf_sd <- left_join(subjectDf_sd, indivDiffDf, by = "id")

# generate a correlational matrix 
corrM_subjectSd <- cor(subjectDf_sd[,-1],use = "pairwise")

as.data.frame(corrM_subjectSd[1:9,8:28]) %>% round(2)
##                distMean distSd Extraversion_mean Agreeableness_mean
## Importance         0.55   0.40             -0.12              -0.04
## Ought              0.48   0.30             -0.14              -0.14
## Attractiveness     0.43   0.39             -0.03               0.07
## Measurability      0.41   0.29             -0.02              -0.13
## Commonality        0.48   0.30              0.07               0.02
## Attainability      0.40   0.29              0.10               0.14
## Visibility         0.54   0.20              0.08              -0.07
## distMean           1.00   0.79              0.04              -0.05
## distSd             0.79   1.00             -0.05              -0.11
##                Conscientiousness_mean Neuroticism_mean OpenMindedness_mean
## Importance                       0.01             0.02               -0.05
## Ought                            0.01             0.00               -0.03
## Attractiveness                  -0.02             0.03                0.00
## Measurability                   -0.01            -0.01                0.01
## Commonality                      0.03             0.03                0.13
## Attainability                   -0.08            -0.02                0.12
## Visibility                       0.08             0.06                0.00
## distMean                         0.04             0.04                0.07
## distSd                           0.01             0.02                0.09
##                BSCS_mean GOS_learning GOS_avoidance GOS_prove GSE_mean LET_mean
## Importance         -0.01         0.07         -0.06     -0.12     0.01    -0.01
## Ought              -0.05         0.00          0.05      0.00    -0.06     0.05
## Attractiveness      0.04        -0.10         -0.03     -0.18    -0.04    -0.09
## Measurability      -0.05         0.04         -0.11     -0.09     0.03    -0.16
## Commonality         0.05         0.22         -0.06      0.02     0.04     0.03
## Attainability      -0.06        -0.06          0.03      0.07    -0.02     0.15
## Visibility          0.02         0.10         -0.07      0.03     0.01    -0.03
## distMean            0.01         0.14         -0.02     -0.08     0.06    -0.02
## distSd              0.04         0.18         -0.06     -0.13     0.02    -0.09
##                PS_mean RSE_mean SWL_mean family_mean competetion_mean
## Importance        0.04     0.00     0.01       -0.02             0.04
## Ought             0.03    -0.02    -0.07       -0.08             0.04
## Attractiveness    0.04    -0.10    -0.18       -0.05            -0.03
## Measurability    -0.01    -0.04    -0.04        0.02             0.16
## Commonality       0.07     0.00    -0.01        0.10             0.06
## Attainability     0.03     0.04     0.08        0.14             0.03
## Visibility        0.14    -0.01    -0.11        0.00             0.06
## distMean          0.12     0.00    -0.11        0.03             0.18
## distSd            0.01     0.03    -0.05        0.00             0.15
##                appearance_mean god_mean academic_mean
## Importance                0.06    -0.10          0.01
## Ought                    -0.07    -0.11         -0.11
## Attractiveness            0.04    -0.08         -0.05
## Measurability             0.04    -0.10          0.00
## Commonality               0.01    -0.01          0.00
## Attainability             0.08     0.07          0.09
## Visibility                0.16    -0.10         -0.03
## distMean                  0.08    -0.12         -0.01
## distSd                    0.07    -0.06          0.06